/* * Copyright (C) 2014 Indeed Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except * in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed under the * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either * express or implied. See the License for the specific language governing permissions and * limitations under the License. */ package com.indeed.flamdex.simple; import com.google.common.base.Charsets; import com.google.common.collect.AbstractIterator; import com.google.common.io.ByteStreams; import com.google.common.io.CountingInputStream; import com.google.common.primitives.Ints; import com.google.common.primitives.Longs; import com.indeed.util.core.sort.Quicksortable; import com.indeed.util.core.sort.Quicksortables; import com.indeed.util.io.Files; import com.indeed.util.serialization.LongSerializer; import com.indeed.util.serialization.StringSerializer; import com.indeed.flamdex.api.DocIdStream; import com.indeed.flamdex.api.FlamdexReader; import com.indeed.flamdex.api.IntTermIterator; import com.indeed.flamdex.api.StringTermIterator; import com.indeed.flamdex.reader.FlamdexMetadata; import com.indeed.flamdex.utils.FlamdexUtils; import com.indeed.flamdex.writer.FlamdexWriter; import com.indeed.flamdex.writer.IntFieldWriter; import com.indeed.flamdex.writer.StringFieldWriter; import com.indeed.lsmtree.core.Generation; import com.indeed.lsmtree.core.ImmutableBTreeIndex; import com.indeed.util.mmap.IntArray; import com.indeed.util.mmap.MMapBuffer; import it.unimi.dsi.fastutil.IndirectPriorityQueue; import it.unimi.dsi.fastutil.ints.IntArrayList; import it.unimi.dsi.fastutil.objects.ObjectHeapSemiIndirectPriorityQueue; import org.apache.log4j.Logger; import java.io.BufferedInputStream; import java.io.EOFException; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.nio.ByteOrder; import java.nio.channels.FileChannel; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.TreeSet; import java.util.UUID; /** * @author jsgroth */ public class SimpleFlamdexWriter implements FlamdexWriter { private static final Logger log = Logger.getLogger(SimpleFlamdexWriter.class); public static final int FORMAT_VERSION = 0; private static final int DOC_ID_BUFFER_SIZE = 32; private static final int BLOCK_SIZE = 64; private final String outputDirectory; private long maxDocs; private final boolean writeBTreesOnClose; private final Set<String> intFields; private final Set<String> stringFields; public SimpleFlamdexWriter(String outputDirectory, long numDocs) throws IOException { this(outputDirectory, numDocs, true, true); } public SimpleFlamdexWriter(String outputDirectory, long numDocs, boolean create) throws IOException { this(outputDirectory, numDocs, create, true); } public SimpleFlamdexWriter(String outputDirectory, long numDocs, boolean create, boolean writeBTreesOnClose) throws IOException { this.outputDirectory = outputDirectory; this.maxDocs = numDocs; this.writeBTreesOnClose = writeBTreesOnClose; if (create) { if (new File(outputDirectory).exists()) { deleteIndex(outputDirectory); } else if (!new File(outputDirectory).mkdirs()) { throw new IOException("unable to create directory at " + outputDirectory); } intFields = new HashSet<String>(); stringFields = new HashSet<String>(); } else { final FlamdexMetadata metadata = FlamdexMetadata.readMetadata(outputDirectory); if (metadata.numDocs != numDocs) { throw new IllegalArgumentException("numDocs (" + numDocs + ") does not match numDocs in existing index (" + metadata.numDocs + ")"); } intFields = new HashSet<String>(metadata.intFields); stringFields = new HashSet<String>(metadata.stringFields); } } @Override public String getOutputDirectory() { return this.outputDirectory; } public void resetMaxDocs(long numDocs) { this.maxDocs = numDocs; } @Override public IntFieldWriter getIntFieldWriter(String field) throws FileNotFoundException { return getIntFieldWriter(field, false); } public IntFieldWriter getIntFieldWriter(String field, boolean blowAway) throws FileNotFoundException { if (!blowAway && intFields.contains(field)) { throw new IllegalArgumentException("already added int field "+field); } intFields.add(field); return SimpleIntFieldWriter.open(outputDirectory, field, maxDocs, writeBTreesOnClose); } @Override public StringFieldWriter getStringFieldWriter(String field) throws FileNotFoundException { return getStringFieldWriter(field, false); } public StringFieldWriter getStringFieldWriter(String field, boolean blowAway) throws FileNotFoundException { if (!blowAway && stringFields.contains(field)) { throw new IllegalArgumentException("already added string field "+field); } stringFields.add(field); return SimpleStringFieldWriter.open(outputDirectory, field, maxDocs, writeBTreesOnClose); } @Override public void close() throws IOException { final List<String> intFieldsList = new ArrayList<String>(intFields); Collections.sort(intFieldsList); final List<String> stringFieldsList = new ArrayList<String>(stringFields); Collections.sort(stringFieldsList); final FlamdexMetadata metadata = new FlamdexMetadata((int)maxDocs, intFieldsList, stringFieldsList, FORMAT_VERSION); FlamdexMetadata.writeMetadata(outputDirectory, metadata); } public static void writeIntBTree(String directory, String intField, File btreeDir) throws IOException { final String termsFilename = Files.buildPath(directory, SimpleIntFieldWriter.getTermsFilename(intField)); if (!new File(termsFilename).exists() || new File(termsFilename).length() == 0L) return; final CountingInputStream termsList = new CountingInputStream(new BufferedInputStream(new FileInputStream(termsFilename), 65536)); try { ImmutableBTreeIndex.Writer.write(btreeDir, new AbstractIterator<Generation.Entry<Long, LongPair>>() { private long lastTerm = 0; private long lastTermDocOffset = 0L; private long lastTermFileOffset = 0L; private long key; private LongPair value; @Override protected Generation.Entry<Long, LongPair> computeNext() { try { if (!nextTerm()) return endOfData(); key = lastTerm; value = new LongPair(lastTermFileOffset, lastTermDocOffset); for (int i = 0; i < BLOCK_SIZE-1; ++i) { if (!nextTerm()) { break; } } return Generation.Entry.create(key, value); } catch (IOException e) { throw new RuntimeException(e); } } private boolean nextTerm() throws IOException { final long termDelta; //sorry try { termDelta = FlamdexUtils.readVLong(termsList); } catch (EOFException e) { return false; } lastTerm += termDelta; final long offsetDelta = FlamdexUtils.readVLong(termsList); lastTermDocOffset += offsetDelta; lastTermFileOffset = termsList.getCount(); FlamdexUtils.readVLong(termsList); // termDocFreq return true; } }, new LongSerializer(), new LongPairSerializer(), 65536, false); } finally { termsList.close(); } } public static void writeStringBTree(String directory, String stringField, File btreeDir) throws IOException { final String termsFilename = Files.buildPath(directory, SimpleStringFieldWriter.getTermsFilename(stringField)); if (!new File(termsFilename).exists() || new File(termsFilename).length() == 0L) return; final CountingInputStream termsList = new CountingInputStream(new BufferedInputStream(new FileInputStream(termsFilename), 65536)); try { ImmutableBTreeIndex.Writer.write(btreeDir, new AbstractIterator<Generation.Entry<String, LongPair>>() { private String key; private LongPair value; private byte[] lastTerm = new byte[10]; private int lastTermLen = 0; private long lastTermDocOffset = 0L; private long lastTermFileOffset = 0L; @Override public Generation.Entry<String, LongPair> computeNext() { try { if (!nextTerm()) return endOfData(); key = new String(lastTerm, 0, lastTermLen, Charsets.UTF_8); value = new LongPair(lastTermFileOffset, lastTermDocOffset); for (int i = 0; i < BLOCK_SIZE - 1; ++i) { if (!nextTerm()) { break; } } return Generation.Entry.create(key, value); } catch (IOException e) { throw new RuntimeException(e); } } private boolean nextTerm() throws IOException { final int removeLen; //sorry try { removeLen = (int) FlamdexUtils.readVLong(termsList); } catch (EOFException e) { return false; } final int newLen = (int)FlamdexUtils.readVLong(termsList); lastTerm = ensureCapacity(lastTerm, lastTermLen - removeLen + newLen); ByteStreams.readFully(termsList, lastTerm, lastTermLen - removeLen, newLen); lastTermLen = lastTermLen - removeLen + newLen; final long offsetDelta = FlamdexUtils.readVLong(termsList); lastTermDocOffset += offsetDelta; lastTermFileOffset = termsList.getCount(); FlamdexUtils.readVLong(termsList); // termDocFreq return true; } }, new StringSerializer(), new LongPairSerializer(), 65536, false); } finally { termsList.close(); } } private static byte[] ensureCapacity(final byte[] a, final int capacity) { return capacity <= a.length ? a : Arrays.copyOf(a, Math.max(2*a.length, capacity)); } public static void writeFlamdex(final FlamdexReader fdx, final FlamdexWriter w) throws IOException { final DocIdStream dis = fdx.getDocIdStream(); final int[] docIdBuf = new int[DOC_ID_BUFFER_SIZE]; for (final String intField : fdx.getIntFields()) { final IntFieldWriter ifw = w.getIntFieldWriter(intField); final IntTermIterator iter = fdx.getIntTermIterator(intField); while (iter.next()) { ifw.nextTerm(iter.term()); dis.reset(iter); while (true) { final int n = dis.fillDocIdBuffer(docIdBuf); for (int i = 0; i < n; ++i) { ifw.nextDoc(docIdBuf[i]); } if (n < docIdBuf.length) break; } } iter.close(); ifw.close(); } for (final String stringField : fdx.getStringFields()) { final StringFieldWriter sfw = w.getStringFieldWriter(stringField); final StringTermIterator iter = fdx.getStringTermIterator(stringField); while (iter.next()) { sfw.nextTerm(iter.term()); dis.reset(iter); while (true) { final int n = dis.fillDocIdBuffer(docIdBuf); for (int i = 0; i < n; ++i) { sfw.nextDoc(docIdBuf[i]); } if (n < docIdBuf.length) break; } } iter.close(); sfw.close(); } dis.close(); w.close(); } public static void merge(Collection<? extends FlamdexReader> readers, FlamdexWriter w) throws IOException { merge(readers.toArray(new FlamdexReader[readers.size()]), w); } public static void merge(FlamdexReader[] readers, FlamdexWriter w) throws IOException { final DocIdStream[] docIdStreams = new DocIdStream[readers.length]; final int[] segmentStartDocs = new int[readers.length]; int totalNumDocs = 0; for (int i = 0; i < readers.length; ++i) { docIdStreams[i] = readers[i].getDocIdStream(); segmentStartDocs[i] = totalNumDocs; totalNumDocs += readers[i].getNumDocs(); } log.info("merging " + readers.length + " readers with a total of " + totalNumDocs + " docs"); final int[] indexBuf = new int[readers.length]; final int[] docIdBuf = new int[64]; for (final String intField : mergeIntFields(readers)) { final IntFieldWriter ifw = w.getIntFieldWriter(intField); final IntTermIteratorWrapper[] iterators = new IntTermIteratorWrapper[readers.length]; final IndirectPriorityQueue<IntTermIteratorWrapper> pq = new ObjectHeapSemiIndirectPriorityQueue<IntTermIteratorWrapper>(iterators, iterators.length); for (int i = 0; i < readers.length; ++i) { if (!readers[i].getIntFields().contains(intField)) continue; final IntTermIterator it = readers[i].getIntTermIterator(intField); if (it.next()) { iterators[i] = new IntTermIteratorWrapper(it, i); pq.enqueue(i); } else { it.close(); } } while (!pq.isEmpty()) { final long term = iterators[pq.first()].it.term(); int numIndexes = 0; IntTermIteratorWrapper wrap; while (!pq.isEmpty() && (wrap = iterators[pq.first()]).it.term() == term) { final int index = wrap.index; docIdStreams[index].reset(wrap.it); indexBuf[numIndexes++] = index; if (wrap.it.next()) { pq.changed(); } else { wrap.it.close(); pq.dequeue(); } } ifw.nextTerm(term); for (int i = 0; i < numIndexes; ++i) { final int index = indexBuf[i]; final int startDoc = segmentStartDocs[index]; final DocIdStream dis = docIdStreams[index]; while (true) { final int n = dis.fillDocIdBuffer(docIdBuf); for (int j = 0; j < n; ++j) { ifw.nextDoc(docIdBuf[j]+startDoc); } if (n < docIdBuf.length) break; } } } ifw.close(); } for (final String stringField : mergeStringFields(readers)) { final StringFieldWriter sfw = w.getStringFieldWriter(stringField); final StringTermIteratorWrapper[] iterators = new StringTermIteratorWrapper[readers.length]; final IndirectPriorityQueue<StringTermIteratorWrapper> pq = new ObjectHeapSemiIndirectPriorityQueue<StringTermIteratorWrapper>(iterators, iterators.length); for (int i = 0; i < readers.length; ++i) { if (!readers[i].getStringFields().contains(stringField)) continue; final StringTermIterator it = readers[i].getStringTermIterator(stringField); if (it.next()) { iterators[i] = new StringTermIteratorWrapper(it, i); pq.enqueue(i); } else { it.close(); } } while (!pq.isEmpty()) { final String term = iterators[pq.first()].it.term(); int numIndexes = 0; StringTermIteratorWrapper wrap; while (!pq.isEmpty() && (wrap = iterators[pq.first()]).it.term().equals(term)) { final int index = wrap.index; docIdStreams[index].reset(wrap.it); indexBuf[numIndexes++] = index; if (wrap.it.next()) { pq.changed(); } else { wrap.it.close(); pq.dequeue(); } } sfw.nextTerm(term); for (int i = 0; i < numIndexes; ++i) { final int index = indexBuf[i]; final int startDoc = segmentStartDocs[index]; final DocIdStream dis = docIdStreams[index]; while (true) { final int n = dis.fillDocIdBuffer(docIdBuf); for (int j = 0; j < n; ++j) { sfw.nextDoc(docIdBuf[j]+startDoc); } if (n < docIdBuf.length) break; } } } sfw.close(); } for (final DocIdStream dis : docIdStreams) { dis.close(); } } private static Set<String> mergeIntFields(FlamdexReader[] readers) { final Set<String> ret = new TreeSet<String>(); for (final FlamdexReader reader : readers) { ret.addAll(reader.getIntFields()); } return ret; } private static Set<String> mergeStringFields(FlamdexReader[] readers) { final Set<String> ret = new TreeSet<String>(); for (final FlamdexReader reader : readers) { ret.addAll(reader.getStringFields()); } return ret; } private static final class IntTermIteratorWrapper implements Comparable<IntTermIteratorWrapper> { private final IntTermIterator it; private final int index; private IntTermIteratorWrapper(IntTermIterator it, int index) { this.it = it; this.index = index; } @Override public int compareTo(IntTermIteratorWrapper o) { final int cmp; return (cmp = Longs.compare(it.term(), o.it.term())) != 0 ? cmp : Ints.compare(index, o.index); } } private static final class StringTermIteratorWrapper implements Comparable<StringTermIteratorWrapper> { private final StringTermIterator it; private final int index; private StringTermIteratorWrapper(StringTermIterator it, int index) { this.it = it; this.index = index; } @Override public int compareTo(StringTermIteratorWrapper o) { final int c = it.term().compareTo(o.it.term()); return c != 0 ? c : index - o.index; } } public static void addField(String dir, String fieldName, FlamdexReader r, final long[] cache) throws IOException { final File tempFile = new File(dir, "temp-" + fieldName + "-" + UUID.randomUUID() + ".intarray.bin"); try { final MMapBuffer buffer = new MMapBuffer(tempFile, 0, 4 * cache.length, FileChannel.MapMode.READ_WRITE, ByteOrder.nativeOrder()); try { final IntArray indices = buffer.memory().intArray(0, cache.length); for (int i = 0; i < cache.length; ++i) { indices.set(i, i); } log.debug("sorting"); Quicksortables.sort(new Quicksortable() { @Override public void swap(int i, int j) { final int t = indices.get(i); indices.set(i, indices.get(j)); indices.set(j, t); } @Override public int compare(int i, int j) { final long ii = cache[indices.get(i)]; final long ij = cache[indices.get(j)]; return ii < ij ? -1 : ii > ij ? 1 : indices.get(i) < indices.get(j) ? -1 : indices.get(i) > indices.get(j) ? 1 : 0; } }, cache.length); log.debug("writing field " + fieldName); final SimpleFlamdexWriter w = new SimpleFlamdexWriter(dir, r.getNumDocs(), false); final IntFieldWriter ifw = w.getIntFieldWriter(fieldName, true); long prev = 0; boolean prevInitialized = false; for (int i = 0; i < cache.length; ++i) { final long cur = cache[indices.get(i)]; if (!prevInitialized || cur != prev) { ifw.nextTerm(cur); prev = cur; prevInitialized = true; } ifw.nextDoc(indices.get(i)); } ifw.close(); w.close(); } finally { try { buffer.close(); } catch (IOException e) { log.error("error closing MMapBuffer", e); } } } finally { if (!tempFile.delete()) { log.warn("unable to delete temp file " + tempFile); } } } public static void addField(String indexDir, String newFieldName, FlamdexReader docReader, final String[] values) throws IOException { final int[] indices = new int[docReader.getNumDocs()]; for (int i = 0; i < indices.length; i++) { indices[i] = i; } log.debug("sorting"); Quicksortables.sort(new Quicksortable() { @Override public void swap(int i, int j) { Quicksortables.swap(indices, i, j); } @Override public int compare(int i, int j) { // Sorting logic: Primarily by value (String), secondarily by document ID (indices[i]) final String left = values[indices[i]]; final String right = values[indices[j]]; if (left.compareTo(right) < 0) { return -1; } else if (left.compareTo(right) > 0) { return 1; } else { // left == right if (indices[i] < indices[j]) { return -1; } else if (indices[i] > indices[j]) { return 1; } else { return 0; // Both value & doc ID match } } } }, values.length); log.debug("writing field " + newFieldName); final SimpleFlamdexWriter w = new SimpleFlamdexWriter(indexDir, docReader.getNumDocs(), false); final StringFieldWriter sfw = w.getStringFieldWriter(newFieldName, true); final IntArrayList docList = new IntArrayList(); docList.add(indices[0]); for (int i = 1; i < indices.length; ++i) { final String prev = values[indices[i - 1]]; final String cur = values[indices[i]]; if (cur.compareTo(prev) != 0) { sfw.nextTerm(prev); for (int j = 0; j < docList.size(); ++j) { sfw.nextDoc(docList.getInt(j)); } docList.clear(); } docList.add(indices[i]); } if (docList.size() > 0) { sfw.nextTerm(values[indices[indices.length - 1]]); for (int j = 0; j < docList.size(); ++j) { sfw.nextDoc(docList.getInt(j)); } } sfw.close(); w.close(); } public static void deleteIndex(final String dir) throws IOException { final File[] files = new File(dir).listFiles(new SimpleFlamdexFileFilter()); if (files != null) { for (final File f : files) { if (f.isDirectory()) { for (final File sub : f.listFiles()) { if (!sub.delete()) { throw new IOException("unable to delete file in index sub directory: " + sub.getAbsolutePath()); } } } if (!f.delete()) { throw new IOException("unable to delete file in index directory: " + f.getAbsolutePath()); } } } } }